# 2410_iPAGE_SoilData_cleaned
import pandas as pd
from scipy import stats
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime
import plotly
import plotly.graph_objs as go
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('2410_iPAGE_SoilData_cleaned.csv')
df.head()
| Area | Data Collection Year | soil group | Land class | knit (surface) | pH | SOC (%) | Nitrogen N (%) | Potassium K (meq/100) | Phosphorus P (ug/g) | Sulfur S (ug/g) | Boron B (ug/g) | Zinc Zn (ug/g) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Mithpukur | 2005 | belab | high ground | Clay loam | 5.0 | 1.27 | 0.08 | 0.15 | 19.6 | 37.7 | 0.26 | 0.86 |
| 1 | Mithpukur | 2005 | belab | high ground | Clay loam | 4.9 | 1.47 | 0.09 | 0.25 | 4.1 | 32.0 | 0.25 | 0.75 |
| 2 | Mithpukur | 2005 | belab | high ground | Clay loam | 4.6 | 1.07 | 0.05 | 0.09 | 13.3 | 13.5 | 0.27 | 0.95 |
| 3 | Mithpukur | 2005 | belab | high ground | Clay loam | 5.2 | 1.51 | 0.06 | 0.30 | 20.2 | 30.0 | 0.28 | 1.00 |
| 4 | Mithpukur | 2005 | belab | high ground | Clay loam | 5.3 | 1.08 | 0.11 | 0.17 | 20.5 | 27.8 | 0.30 | 1.04 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 618 entries, 0 to 617 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Area 618 non-null object 1 Data Collection Year 618 non-null int64 2 soil group 618 non-null object 3 Land class 618 non-null object 4 knit (surface) 461 non-null object 5 pH 618 non-null float64 6 SOC (%) 618 non-null float64 7 Nitrogen N (%) 618 non-null float64 8 Potassium K (meq/100) 618 non-null float64 9 Phosphorus P (ug/g) 618 non-null float64 10 Sulfur S (ug/g) 618 non-null float64 11 Boron B (ug/g) 618 non-null float64 12 Zinc Zn (ug/g) 618 non-null float64 dtypes: float64(8), int64(1), object(4) memory usage: 62.9+ KB
df.shape
(618, 13)
df.select_dtypes(include=['object']).columns
Index(['Area', 'soil group', 'Land class', 'knit (surface)'], dtype='object')
df.select_dtypes(include=['float64']).columns
Index(['pH', 'SOC (%)', 'Nitrogen N (%)', 'Potassium K (meq/100)',
'Phosphorus P (ug/g)', 'Sulfur S (ug/g)', 'Boron B (ug/g)',
'Zinc Zn (ug/g)'],
dtype='object')
# Check for missing values
missing_values = df.isnull().sum()
missing_values
Area 0 Data Collection Year 0 soil group 0 Land class 0 knit (surface) 157 pH 0 SOC (%) 0 Nitrogen N (%) 0 Potassium K (meq/100) 0 Phosphorus P (ug/g) 0 Sulfur S (ug/g) 0 Boron B (ug/g) 0 Zinc Zn (ug/g) 0 dtype: int64
# Check for duplicates
duplicates = df.duplicated()
print(duplicates.sum())
0
# Value counts for each categorical feature:
for column in df.select_dtypes(include=['object']):
print(f"Value counts for {column}:")
print(df[column].value_counts())
print(df[column].nunique())
print()
print("="*45)
Value counts for Area: Area Mithpukur 213 Pirgacha 149 Kaunia upazila 99 Taraganj Thana 97 Gangachara 60 Name: count, dtype: int64 5 ============================================= Value counts for soil group: soil group Gangachara 238 Palashbari 66 Kaunia 63 Alluvial soil of Teesta 39 Pirgacha 36 Berries 32 belab 20 the moon 18 Sahapur 16 Noadda 15 Teesta sandy soil 11 Imadpur 11 Berries 10 Saghata 8 recession 8 Kachna 6 Khilgaon 5 Bhimpur 5 The militants 4 Chilmari 2 Menanagar 2 the pen 1 Imadpur 1 Indrail 1 Name: count, dtype: int64 24 ============================================= Value counts for Land class: Land class Medium high land 319 high ground 268 Medium low land 25 Deep medium high land 3 Shallow to medium high land 3 Name: count, dtype: int64 5 ============================================= Value counts for knit (surface): knit (surface) loam 358 Clay loam 76 loam clay 15 brick 9 in the sand 3 Name: count, dtype: int64 5 =============================================
print(df["Data Collection Year"].value_counts())
print(df["Data Collection Year"].nunique())
Data Collection Year 2005 362 2016 99 1997 97 1990 60 Name: count, dtype: int64 4
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Data Collection Year | 618.0 | 2004.050162 | 7.103021 | 1990.00 | 1997.0000 | 2005.00 | 2005.0000 | 2016.00 |
| pH | 618.0 | 5.236942 | 0.469949 | 4.10 | 4.9000 | 5.20 | 5.4000 | 7.80 |
| SOC (%) | 618.0 | 1.792214 | 1.034159 | 0.10 | 1.3525 | 1.70 | 2.1600 | 20.40 |
| Nitrogen N (%) | 618.0 | 0.111832 | 0.507259 | 0.00 | 0.0700 | 0.09 | 0.1100 | 12.60 |
| Potassium K (meq/100) | 618.0 | 0.241246 | 0.581693 | 0.00 | 0.1100 | 0.17 | 0.2875 | 14.20 |
| Phosphorus P (ug/g) | 618.0 | 31.144256 | 38.078580 | 0.00 | 8.7850 | 16.00 | 37.7000 | 254.70 |
| Sulfur S (ug/g) | 618.0 | 15.340676 | 11.062096 | 0.40 | 8.7025 | 12.60 | 19.0000 | 85.00 |
| Boron B (ug/g) | 618.0 | 0.348997 | 0.258153 | 0.00 | 0.2300 | 0.30 | 0.4000 | 2.66 |
| Zinc Zn (ug/g) | 618.0 | 1.039191 | 0.976144 | 0.08 | 0.5200 | 0.82 | 1.1000 | 10.58 |
# Detecting outliers using IQR (Interquartile Range)
numeric_data = df.select_dtypes(include=['number'])
# Calculate the first and third quartile
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
# Calculate the interquartile range (IQR)
IQR = Q3 - Q1
# Detect outliers
outliers = ((numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))).sum()
print(outliers)
Data Collection Year 0 pH 25 SOC (%) 20 Nitrogen N (%) 88 Potassium K (meq/100) 23 Phosphorus P (ug/g) 57 Sulfur S (ug/g) 42 Boron B (ug/g) 36 Zinc Zn (ug/g) 56 dtype: int64
# Histogram Plots
num_rows = 7
num_cols = 2
# Create a grid of subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15,22))
# Flatten the axes array for easy iteration
axes = axes.flatten()
# Iterate over columns and create histograms
for i, col in enumerate(df.columns):
sns.histplot(df[col], kde=True, ax=axes[i])
axes[i].set_xlabel(col)
# Hide any unused subplots
for i in range(len(df.columns), num_rows*num_cols):
fig.delaxes(axes[i])
# Adjust layout
plt.tight_layout()
# Show the plots
plt.show()
# Calculate skewness for all numerical columns
skewness = df.select_dtypes(include=['number']).skew()
# Display the skewness values
print(skewness)
Data Collection Year -0.141388 pH 1.245709 SOC (%) 9.632945 Nitrogen N (%) 24.273522 Potassium K (meq/100) 22.488278 Phosphorus P (ug/g) 2.620437 Sulfur S (ug/g) 1.905179 Boron B (ug/g) 3.437144 Zinc Zn (ug/g) 3.954932 dtype: float64
numeric_df = df.select_dtypes(include=[np.number])
df_corr = numeric_df.corr(method='pearson')
df_corr= df_corr.applymap("{:.3f}".format)
df_corr
| Data Collection Year | pH | SOC (%) | Nitrogen N (%) | Potassium K (meq/100) | Phosphorus P (ug/g) | Sulfur S (ug/g) | Boron B (ug/g) | Zinc Zn (ug/g) | |
|---|---|---|---|---|---|---|---|---|---|
| Data Collection Year | 1.000 | -0.140 | 0.198 | 0.038 | -0.049 | -0.027 | -0.125 | 0.257 | -0.006 |
| pH | -0.140 | 1.000 | -0.241 | 0.038 | 0.098 | -0.087 | 0.158 | 0.092 | 0.129 |
| SOC (%) | 0.198 | -0.241 | 1.000 | 0.023 | -0.050 | -0.139 | 0.059 | 0.072 | -0.015 |
| Nitrogen N (%) | 0.038 | 0.038 | 0.023 | 1.000 | 0.952 | -0.050 | -0.060 | 0.012 | -0.053 |
| Potassium K (meq/100) | -0.049 | 0.098 | -0.050 | 0.952 | 1.000 | 0.054 | 0.026 | 0.033 | 0.089 |
| Phosphorus P (ug/g) | -0.027 | -0.087 | -0.139 | -0.050 | 0.054 | 1.000 | 0.107 | 0.160 | 0.332 |
| Sulfur S (ug/g) | -0.125 | 0.158 | 0.059 | -0.060 | 0.026 | 0.107 | 1.000 | 0.245 | 0.262 |
| Boron B (ug/g) | 0.257 | 0.092 | 0.072 | 0.012 | 0.033 | 0.160 | 0.245 | 1.000 | 0.169 |
| Zinc Zn (ug/g) | -0.006 | 0.129 | -0.015 | -0.053 | 0.089 | 0.332 | 0.262 | 0.169 | 1.000 |
# Correlation Matrix
correlation_matrix = numeric_df.corr()
plt.figure(figsize=(8, 6))
mask = np.triu(correlation_matrix)
sns.heatmap(correlation_matrix, annot=True, cmap='viridis_r', center=0, mask=mask, fmt=".2f")
plt.xticks(rotation = 90, fontsize=8)
plt.yticks(fontsize=8)
plt.title('Correlational Heatmap')
plt.show()
# Strongest Positive Correlated Features
numeric_df = df.select_dtypes(include=[np.number])
df_corr = numeric_df.corr(method='pearson')
# Threshold for high correlation
threshold = 0.50
# Extract the pairs of features with correlation values greater than the threshold
corr_pairs = df_corr.unstack()
high_corr = corr_pairs[(corr_pairs > threshold) & (corr_pairs < 1)]
# Drop duplicate pairs by sorting the index and dropping duplicates
high_corr = high_corr.reset_index()
high_corr.columns = ['Feature1', 'Feature2', 'Correlation']
high_corr = high_corr.drop_duplicates(subset=['Correlation'])
# Sort by correlation value in descending order
high_corr = high_corr.sort_values(by='Correlation', ascending=False)
# Display the pairs
high_corr
| Feature1 | Feature2 | Correlation | |
|---|---|---|---|
| 0 | Nitrogen N (%) | Potassium K (meq/100) | 0.951968 |
# Strongest Negative Correlated Features
# Set the threshold for the negative correlation value
threshold = -0.5
# Extract the pairs of features with correlation values between -1 and the threshold
neg_corr_pairs = df_corr.unstack()
high_neg_corr = neg_corr_pairs[(neg_corr_pairs < threshold) & (neg_corr_pairs > -1)]
# Drop duplicate pairs by sorting the index and dropping duplicates
high_neg_corr = high_neg_corr.reset_index()
high_neg_corr.columns = ['Feature1', 'Feature2', 'Correlation']
high_neg_corr = high_neg_corr.drop_duplicates(subset=['Correlation'])
# Filter out self-correlations
high_neg_corr = high_neg_corr[high_neg_corr['Feature1'] != high_neg_corr['Feature2']]
# Sort by correlation value in ascending order
high_neg_corr = high_neg_corr.sort_values(by='Correlation')
# Display the pairs
high_neg_corr
| Feature1 | Feature2 | Correlation |
|---|
df_corr["SOC (%)"].sort_values(ascending=False)
SOC (%) 1.000000 Data Collection Year 0.197989 Boron B (ug/g) 0.071694 Sulfur S (ug/g) 0.058506 Nitrogen N (%) 0.022636 Zinc Zn (ug/g) -0.014694 Potassium K (meq/100) -0.049869 Phosphorus P (ug/g) -0.139034 pH -0.240970 Name: SOC (%), dtype: float64
# sorted_corr values are numeric
sorted_corr = df_corr["SOC (%)"].sort_values(ascending=False).astype(float)
# Create a bar plot
plt.figure(figsize=(5,4))
sns.barplot(x=sorted_corr.values, y=sorted_corr.index, palette="viridis")
plt.title('Correlation with SOC (%)')
plt.xlabel('Correlation Coefficient')
plt.ylabel('')
plt.axvline(x=0, color='grey', linestyle='--')
plt.show()
df.select_dtypes(include=['float64']).columns
Index(['pH', 'SOC (%)', 'Nitrogen N (%)', 'Potassium K (meq/100)',
'Phosphorus P (ug/g)', 'Sulfur S (ug/g)', 'Boron B (ug/g)',
'Zinc Zn (ug/g)'],
dtype='object')
# Create a pairplot - Numerical Variables:
sns.pairplot(df[['pH', 'SOC (%)', 'Nitrogen N (%)', 'Potassium K (meq/100)',
'Phosphorus P (ug/g)', 'Sulfur S (ug/g)', 'Boron B (ug/g)',
'Zinc Zn (ug/g)']])
plt.show()
# Histogram:
plt.figure(figsize=(6, 4))
sns.histplot(data=df, x='SOC (%)', element='step', common_norm=False)
mean = np.mean(df['SOC (%)'])
median = np.median(df['SOC (%)'])
std_deviation = np.std(df['SOC (%)'])
plt.axvline(mean, color='red', linestyle='--')
plt.axvline(median, color='blue', linestyle='--')
plt.axvline(x=std_deviation, color='green', linestyle='--')
# annotate mean,median & Standard deviation
plt.annotate(f'Mean: {mean:.3}', xy=(mean, plt.gca().get_ylim()[1]), xytext=(100, -30),
textcoords='offset points', color='red', fontsize=10)
plt.annotate(f'Median: {median:.3f}', xy=(median, plt.gca().get_ylim()[1]), xytext=(100, -50),
textcoords='offset points', color='blue', fontsize=10)
plt.annotate(f'Standard deviation: {std_deviation:.3f}', xy=(std_deviation, plt.gca().get_ylim()[1]), xytext=(70, -70),
textcoords='offset points', color='green', fontsize=10)
plt.xlabel('SOC (%)')
plt.ylabel('Frequency')
plt.title('Distribution of SOC (%)')
plt.show()
df["SOC (%)"].describe()
count 618.000000 mean 1.792214 std 1.034159 min 0.100000 25% 1.352500 50% 1.700000 75% 2.160000 max 20.400000 Name: SOC (%), dtype: float64
skewness = df["SOC (%)"].skew()
print(f"Skewness: {skewness}")
Skewness: 9.632944630840102
# Create a box plot
plt.figure(figsize=(6, 4))
sns.boxplot(y='SOC (%)', data=df)
plt.ylabel('SOC (%)')
plt.title('Box Plot of SOC (%)')
plt.show()
# Create a violin plot
plt.figure(figsize=(6, 4))
sns.violinplot(y='SOC (%)', data=df)
plt.ylabel('SOC (%)')
plt.title('Violin Plot of Yield')
plt.show()
df.select_dtypes(include=['object']).columns
Index(['Area', 'soil group', 'Land class', 'knit (surface)'], dtype='object')
plt.figure(figsize=(10, 10))
# Define the list of hues
hues = ['Area', 'Land class', 'knit (surface)']
# Loop through each hue
for i, hue in enumerate(hues, 1):
plt.subplot(len(hues), 1, i)
sns.kdeplot(data=df, x="SOC (%)", alpha=0.3, shade=True, hue=hue, palette='viridis')
plt.xlabel('SOC (%)', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.title(f'KDE Plot of SOC (%) by {hue.capitalize()}' , fontsize=12)
plt.tight_layout()
plt.show()
"SOC (%)" and "Area"
# Box Plot for "SOC (%)" and "Area":
custom_colors = px.colors.qualitative.Plotly[:len(df['Area'].unique())]
# Create the box plot using plotly with custom colors
fig = px.box(df, x='Area', y='SOC (%)', title='Box Plot for SOC (%) and Area',
color='Area', color_discrete_sequence=custom_colors)
# Update layout to adjust the figure size
fig.update_layout(width=1000, height=600)
fig.show()
# Average "SOC (%)" for "Area"
# Group data by "Area" and calculate the Average "SOC (%)":
average_SOC = df.groupby("Area")["SOC (%)"].mean().reset_index()
# Sort the data by "SOC (%)" in descending order
average_SOC = average_SOC.sort_values(by="SOC (%)", ascending=False)
# Create the bar chart
fig = px.bar(average_SOC, x="Area", y="SOC (%)",
labels={"Area": "Area", "SOC (%)": "SOC (%)"},
title="Average SOC (%) for each Area",
color="Area",
color_discrete_sequence=px.colors.sequential.Viridis)
# Set font size for x and y axis labels
fig.update_xaxes(title_font_size=12, tickfont_size=12)
fig.update_yaxes(title_font_size=12, tickfont_size=12)
# Rotate x-axis labels if needed
# fig.update_xaxes(tickangle=90)
fig.show()
average_SOC
| Area | SOC (%) | |
|---|---|---|
| 3 | Pirgacha | 2.168054 |
| 1 | Kaunia upazila | 1.842222 |
| 2 | Mithpukur | 1.799239 |
| 4 | Taraganj Thana | 1.654021 |
| 0 | Gangachara | 0.974833 |
Highest SOC (%): Pirgacha leads with the highest average SOC (%) at 2.17%, followed by Kaunia upazila (1.84%) and Mithpukur (1.80%).
Lowest SOC (%): Gangachara shows the lowest average SOC (%) among the areas, at 0.97%.
These differences in SOC (%) suggest that Pirgacha has potentially richer organic carbon content, while Gangachara may require soil improvement efforts for enhanced soil quality.
# Percentage of "SOC (%)" for "Area"
# Group data by "Area" and calculate the total and percentage of "SOC (%)"
total_SOC = df["SOC (%)"].sum()
# Calculate the percentage for each Area
df_percentage = df.groupby("Area")["SOC (%)"].sum().reset_index()
df_percentage['SOC (%)'] = (df_percentage['SOC (%)'] / total_SOC) * 100
# Sort the data by "SOC (%)" percentage in descending order
df_percentage = df_percentage.sort_values(by="SOC (%)", ascending=False)
# Create the bar chart using plotly
fig = px.bar(df_percentage, x="Area", y="SOC (%)", color="Area",
labels={"Area": "Area", "SOC (%)": "Percentage of SOC (%)"},
title="Percentage of SOC (%) for each Area",
color_discrete_sequence=px.colors.sequential.Viridis)
# Set font size for x and y axis labels
fig.update_xaxes(title_font_size=12, tickfont_size=12)
fig.update_yaxes(title_font_size=12, tickfont_size=12)
# Rotate x-axis labels if necessary
# fig.update_xaxes(tickangle=90)
fig.show()
df_percentage
| Area | SOC (%) | |
|---|---|---|
| 2 | Mithpukur | 34.601133 |
| 3 | Pirgacha | 29.166080 |
| 1 | Kaunia upazila | 16.466412 |
| 4 | Taraganj Thana | 14.485531 |
| 0 | Gangachara | 5.280845 |
"SOC (%)" and "soil group"
# Box Plot for "SOC (%)" and "soil group":
custom_colors = px.colors.qualitative.Plotly[:len(df['soil group'].unique())]
# Create the box plot using plotly with custom colors
fig = px.box(df, x='soil group', y='SOC (%)', title='Box Plot for SOC (%) and soil group',
color='soil group', color_discrete_sequence=custom_colors)
# Update layout to adjust the figure size
fig.update_layout(width=1000, height=600)
fig.show()
# Average "SOC (%)" for "soil group"
# Group data by "soil group" and calculate the Average "SOC (%)":
average_SOC = df.groupby("soil group")["SOC (%)"].mean().reset_index()
# Sort the data by "SOC (%)" in descending order
average_SOC = average_SOC.sort_values(by="SOC (%)", ascending=False)
# Create the bar chart
fig = px.bar(average_SOC, x="soil group", y="SOC (%)",
labels={"soil group": "soil group", "SOC (%)": "SOC (%)"},
title="Average SOC (%) for each soil group",
color="soil group",
color_discrete_sequence=px.colors.sequential.Viridis)
# Set font size for x and y axis labels
fig.update_xaxes(title_font_size=12, tickfont_size=12)
fig.update_yaxes(title_font_size=12, tickfont_size=12)
# Rotate x-axis labels if needed
# fig.update_xaxes(tickangle=90)
fig.show()
average_SOC
| soil group | SOC (%) | |
|---|---|---|
| 19 | The militants | 4.310000 |
| 7 | Imadpur | 2.870000 |
| 8 | Indrail | 2.790000 |
| 9 | Kachna | 2.596667 |
| 10 | Kaunia | 2.468889 |
| 11 | Khilgaon | 2.456000 |
| 23 | the pen | 2.380000 |
| 3 | Bhimpur | 2.226000 |
| 6 | Imadpur | 2.000909 |
| 17 | Sahapur | 1.936250 |
| 4 | Chilmari | 1.895000 |
| 5 | Gangachara | 1.882134 |
| 22 | the moon | 1.843889 |
| 15 | Pirgacha | 1.799167 |
| 13 | Noadda | 1.719267 |
| 2 | Berries | 1.614000 |
| 21 | recession | 1.577500 |
| 1 | Berries | 1.507531 |
| 12 | Menanagar | 1.505000 |
| 14 | Palashbari | 1.491364 |
| 20 | belab | 1.428000 |
| 16 | Saghata | 1.195000 |
| 0 | Alluvial soil of Teesta | 0.991795 |
| 18 | Teesta sandy soil | 0.369091 |
Top Soil Groups by Average SOC (%): "The militants" soil group has the highest average SOC, with 4.31%. This is followed by soil groups like "Imadpur" (2.87%), "Indrail" (2.79%), and "Kachna" (2.60%).
Mid-range SOC (%) Groups: Soil groups such as "Kaunia," "Khilgaon," and "the pen" display moderate SOC levels, averaging around 2.38% to 2.46%.
Lowest SOC (%) Groups: "Alluvial soil of Teesta" and "Teesta sandy soil" have the lowest SOC averages, at 0.99% and 0.37%, respectively.
# Percentage of "SOC (%)" for "soil group"
# Group data by "soil group" and calculate the total and percentage of "SOC (%)"
total_SOC = df["SOC (%)"].sum()
# Calculate the percentage for each soil group
df_percentage = df.groupby("soil group")["SOC (%)"].sum().reset_index()
df_percentage['SOC (%)'] = (df_percentage['SOC (%)'] / total_SOC) * 100
# Sort the data by "SOC (%)" percentage in descending order
df_percentage = df_percentage.sort_values(by="SOC (%)", ascending=False)
# Create the bar chart using plotly
fig = px.bar(df_percentage, x="soil group", y="SOC (%)", color="soil group",
labels={"soil group": "soil group", "SOC (%)": "Percentage of SOC (%)"},
title="Percentage of SOC (%) for each soil group",
color_discrete_sequence=px.colors.sequential.Viridis)
# Set font size for x and y axis labels
fig.update_xaxes(title_font_size=12, tickfont_size=12)
fig.update_yaxes(title_font_size=12, tickfont_size=12)
# Rotate x-axis labels if necessary
# fig.update_xaxes(tickangle=90)
fig.show()
df_percentage
| soil group | SOC (%) | |
|---|---|---|
| 5 | Gangachara | 40.443558 |
| 10 | Kaunia | 14.043128 |
| 14 | Palashbari | 8.886879 |
| 15 | Pirgacha | 5.847842 |
| 1 | Berries | 4.355500 |
| 0 | Alluvial soil of Teesta | 3.492273 |
| 22 | the moon | 2.996602 |
| 17 | Sahapur | 2.797069 |
| 20 | belab | 2.578576 |
| 13 | Noadda | 2.328393 |
| 6 | Imadpur | 1.987201 |
| 19 | The militants | 1.556535 |
| 2 | Berries | 1.457221 |
| 9 | Kachna | 1.406660 |
| 21 | recession | 1.139413 |
| 11 | Khilgaon | 1.108716 |
| 3 | Bhimpur | 1.004886 |
| 16 | Saghata | 0.863137 |
| 18 | Teesta sandy soil | 0.366562 |
| 4 | Chilmari | 0.342185 |
| 12 | Menanagar | 0.271762 |
| 7 | Imadpur | 0.259122 |
| 8 | Indrail | 0.251899 |
| 23 | the pen | 0.214881 |
Top Contributors: "Gangachara" contributes the highest percentage of SOC (40.44%), followed by "Kaunia" (14.04%) and "Palashbari" (8.89%). Together, these three soil groups make up the majority of SOC in the dataset.
Moderate Contributors: Soil groups such as "Pirgacha," "Berries," and "Alluvial soil of Teesta" contribute between 3-6%, showing moderate SOC shares.
Lowest Contributors: Soil groups like "Teesta sandy soil," "Chilmari," "Menanagar," and others have minimal SOC contributions, under 1%.
"SOC (%)" and "Land class"
# Box Plot for "SOC (%)" and "Land class":
custom_colors = px.colors.qualitative.Plotly[:len(df['Land class'].unique())]
# Create the box plot using plotly with custom colors
fig = px.box(df, x='Land class', y='SOC (%)', title='Box Plot for SOC (%) and Land class',
color='Land class', color_discrete_sequence=custom_colors)
# Update layout to adjust the figure size
fig.update_layout(width=1000, height=600)
fig.show()
# Average "SOC (%)" for "Land class"
# Group data by "Land class" and calculate the Average "SOC (%)":
average_SOC = df.groupby("Land class")["SOC (%)"].mean().reset_index()
# Sort the data by "SOC (%)" in descending order
average_SOC = average_SOC.sort_values(by="SOC (%)", ascending=False)
# Create the bar chart
fig = px.bar(average_SOC, x="Land class", y="SOC (%)",
labels={"Land class": "Land class", "SOC (%)": "SOC (%)"},
title="Average SOC (%) for each Land class",
color="Land class",
color_discrete_sequence=px.colors.sequential.Viridis)
# Set font size for x and y axis labels
fig.update_xaxes(title_font_size=12, tickfont_size=12)
fig.update_yaxes(title_font_size=12, tickfont_size=12)
# Rotate x-axis labels if needed
# fig.update_xaxes(tickangle=90)
fig.show()
average_SOC
| Land class | SOC (%) | |
|---|---|---|
| 0 | Deep medium high land | 2.333333 |
| 2 | Medium low land | 2.145200 |
| 1 | Medium high land | 1.890564 |
| 3 | Shallow to medium high land | 1.806667 |
| 4 | high ground | 1.636000 |
Highest SOC: "Deep medium high land" has the highest average SOC at 2.33%, indicating greater soil organic carbon content in this land class.
Moderate SOC: "Medium low land" and "Medium high land" follow with average SOC values of 2.15% and 1.89%, respectively.
Lower SOC: "Shallow to medium high land" and "High ground" have the lowest average SOC, at 1.81% and 1.64%, respectively.
# Percentage of "SOC (%)" for "Land class"
# Group data by "Land class" and calculate the total and percentage of "SOC (%)"
total_SOC = df["SOC (%)"].sum()
# Calculate the percentage for each Land class
df_percentage = df.groupby("Land class")["SOC (%)"].sum().reset_index()
df_percentage['SOC (%)'] = (df_percentage['SOC (%)'] / total_SOC) * 100
# Sort the data by "SOC (%)" percentage in descending order
df_percentage = df_percentage.sort_values(by="SOC (%)", ascending=False)
# Create the bar chart using plotly
fig = px.bar(df_percentage, x="Land class", y="SOC (%)", color="Land class",
labels={"Land class": "Land class", "SOC (%)": "Percentage of SOC (%)"},
title="Percentage of SOC (%) for each Land class",
color_discrete_sequence=px.colors.sequential.Viridis)
# Set font size for x and y axis labels
fig.update_xaxes(title_font_size=12, tickfont_size=12)
fig.update_yaxes(title_font_size=12, tickfont_size=12)
# Rotate x-axis labels if necessary
# fig.update_xaxes(tickangle=90)
fig.show()
df_percentage
| Land class | SOC (%) | |
|---|---|---|
| 1 | Medium high land | 54.450752 |
| 4 | high ground | 39.585839 |
| 2 | Medium low land | 4.842053 |
| 0 | Deep medium high land | 0.632004 |
| 3 | Shallow to medium high land | 0.489352 |
Dominance of "Medium high land" and "High ground": "Medium high land" accounts for 54.45% of the total SOC, followed closely by "High ground" at 39.59%. Together, these two land classes contain nearly all SOC in the dataset.
Low SOC Contribution from Other Classes: "Medium low land," "Deep medium high land," and "Shallow to medium high land" contribute minimally, with percentages below 5%.
This distribution contrasts with the previous average results, where "Deep medium high land" showed the highest average SOC (%), though its total SOC contribution remains small relative to other land classes.
This indicates that while some classes may have high average SOC, they represent smaller portions of the land area, resulting in a lower overall SOC contribution.
"SOC (%)" and "knit (surface)"
# Box Plot for "SOC (%)" and "knit (surface)":
custom_colors = px.colors.qualitative.Plotly[:len(df['knit (surface)'].unique())]
# Create the box plot using plotly with custom colors
fig = px.box(df, x='knit (surface)', y='SOC (%)', title='Box Plot for SOC (%) and knit (surface)',
color='knit (surface)', color_discrete_sequence=custom_colors)
# Update layout to adjust the figure size
fig.update_layout(width=1000, height=600)
fig.show()
# Average "SOC (%)" for "knit (surface)"
# Group data by "knit (surface)" and calculate the Average "SOC (%)":
average_SOC = df.groupby("knit (surface)")["SOC (%)"].mean().reset_index()
# Sort the data by "SOC (%)" in descending order
average_SOC = average_SOC.sort_values(by="SOC (%)", ascending=False)
# Create the bar chart
fig = px.bar(average_SOC, x="knit (surface)", y="SOC (%)",
labels={"knit (surface)": "knit (surface)", "SOC (%)": "SOC (%)"},
title="Average SOC (%) for each knit (surface)",
color="knit (surface)",
color_discrete_sequence=px.colors.sequential.Viridis)
# Set font size for x and y axis labels
fig.update_xaxes(title_font_size=12, tickfont_size=12)
fig.update_yaxes(title_font_size=12, tickfont_size=12)
# Rotate x-axis labels if needed
# fig.update_xaxes(tickangle=90)
fig.show()
average_SOC
| knit (surface) | SOC (%) | |
|---|---|---|
| 1 | brick | 3.256667 |
| 4 | loam clay | 2.351333 |
| 0 | Clay loam | 2.085382 |
| 3 | loam | 1.851394 |
| 2 | in the sand | 0.930000 |
# Percentage of "SOC (%)" for "knit (surface)"
# Group data by "knit (surface)" and calculate the total and percentage of "SOC (%)"
total_SOC = df["SOC (%)"].sum()
# Calculate the percentage for each Land class
df_percentage = df.groupby("knit (surface)")["SOC (%)"].sum().reset_index()
df_percentage['SOC (%)'] = (df_percentage['SOC (%)'] / total_SOC) * 100
# Sort the data by "SOC (%)" percentage in descending order
df_percentage = df_percentage.sort_values(by="SOC (%)", ascending=False)
# Create the bar chart using plotly
fig = px.bar(df_percentage, x="knit (surface)", y="SOC (%)", color="knit (surface)",
labels={"knit (surface)": "knit (surface)", "SOC (%)": "Percentage of SOC (%)"},
title="Percentage of SOC (%) for each knit (surface)",
color_discrete_sequence=px.colors.sequential.Viridis)
# Set font size for x and y axis labels
fig.update_xaxes(title_font_size=12, tickfont_size=12)
fig.update_yaxes(title_font_size=12, tickfont_size=12)
# Rotate x-axis labels if necessary
# fig.update_xaxes(tickangle=90)
fig.show()
df_percentage
| knit (surface) | SOC (%) | |
|---|---|---|
| 3 | loam | 59.841656 |
| 0 | Clay loam | 14.309382 |
| 4 | loam clay | 3.184397 |
| 1 | brick | 2.646291 |
| 2 | in the sand | 0.251899 |
Loam contributes the most, with nearly 59.84% of the total SOC across all surfaces, which strongly dominates the distribution.
Clay loam has a smaller share, with 14.31% of the total SOC.
Loam clay follows, accounting for about 3.18% of the total SOC.
Brick surfaces contribute 2.65%, and in the sand has the lowest percentage at 0.25%
How does the average SOC (%) vary by Land class within each area?
# Average SOC (%) by Land class for each Area
for Area, group_data in df.groupby('Area'):
avg_SOC = group_data.groupby('Land class')['SOC (%)'].mean().reset_index()
fig = px.bar(avg_SOC, x='Land class', y='SOC (%)',
title=f'Average SOC % by Land class in {Area}',
labels={'SOC (%)': 'SOC (%)', 'Land class': 'Land class'},
color='SOC (%)',
color_continuous_scale=px.colors.sequential.Viridis)
# Update the layout
fig.update_layout(
width=1000,
height=500,
legend_title_text=Area,
xaxis_title='Land class',
yaxis_title='Average SOC (%)')
# Show the plot
fig.show()
Q: How does the average SOC (%) vary by soil group within each area?
# Average SOC (%) by soil group for each Area
for Area, group_data in df.groupby('Area'):
avg_SOC = group_data.groupby('soil group')['SOC (%)'].mean().reset_index()
fig = px.bar(avg_SOC, x='soil group', y='SOC (%)',
title=f'Average SOC % by soil group in {Area}',
labels={'SOC (%)': 'SOC (%)', 'soil group': 'soil group'},
color='SOC (%)',
color_continuous_scale=px.colors.sequential.Viridis)
# Update the layout
fig.update_layout(
width=1000,
height=500,
legend_title_text=Area,
xaxis_title='soil group',
yaxis_title='Average SOC (%)')
# Show the plot
fig.show()
df.select_dtypes(include=['float64']).columns
Index(['pH', 'SOC (%)', 'Nitrogen N (%)', 'Potassium K (meq/100)',
'Phosphorus P (ug/g)', 'Sulfur S (ug/g)', 'Boron B (ug/g)',
'Zinc Zn (ug/g)'],
dtype='object')
# Before Outlier Removal
import matplotlib.ticker as ticker
x_variables = ['pH', 'Nitrogen N (%)', 'Potassium K (meq/100)',
'Phosphorus P (ug/g)', 'Sulfur S (ug/g)', 'Boron B (ug/g)',
'Zinc Zn (ug/g)']
for x_var in x_variables:
plt.figure(figsize=(10, 4))
sns.scatterplot(data=df, x=x_var, y='SOC (%)', hue='Area', palette='viridis')
plt.title(f'Scatter Plot of SOC (%) and {x_var} - Before Outlier Removal')
plt.xlabel(x_var)
plt.ylabel('SOC (%)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()
Handling Outliers
# Calculate the first and third quartile for numerical data
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1
# Create a filter to keep only non-outliers
non_outliers_filter = ~((numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))).any(axis=1)
# Apply the filter to your dataframe
df_no_outliers = df[non_outliers_filter]
# Print the number of rows remaining
print(f"Number of rows after removing outliers: {len(df_no_outliers)}")
Number of rows after removing outliers: 409
print(df.shape)
print(df_no_outliers.shape)
(618, 13) (409, 13)
# After Outlier Removal
import matplotlib.ticker as ticker
x_variables = ['pH', 'Nitrogen N (%)', 'Potassium K (meq/100)',
'Phosphorus P (ug/g)', 'Sulfur S (ug/g)', 'Boron B (ug/g)',
'Zinc Zn (ug/g)']
for x_var in x_variables:
plt.figure(figsize=(10, 4))
sns.scatterplot(data=df_no_outliers, x=x_var, y='SOC (%)', hue='Area', palette='viridis')
plt.title(f'Scatter Plot of SOC (%) and {x_var} - After Outlier Removal')
plt.xlabel(x_var)
plt.ylabel('SOC (%)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()